Primary analyses
Load modeling data
Here, we load the results from the neural language model analyses and merge them with our norming data.
# Read the stimulus table that carries the BERT and ELMo distance columns.
df_distances <- read_csv(file = "../../data/processed/stims_with_nlm_distances.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## Class = col_character(),
## ambiguity_type = col_character(),
## ambiguity_type_mw = col_character(),
## ambiguity_type_oed = col_character(),
## different_frame = col_character(),
## distance_bert = col_double(),
## distance_elmo = col_double(),
## overlap = col_character(),
## same = col_logical(),
## source = col_character(),
## string = col_character(),
## version = col_character(),
## word = col_character()
## )
# Size of the distance table before merging.
nrow(df_distances)
## [1] 690
# Attach the distance columns to every critical norming row. A left join
# keeps all rows of df_normed_critical, matched on the shared stimulus keys.
df_merged <- left_join(
  df_normed_critical,
  df_distances,
  by = c("word", "version", "string", "overlap",
         "source", "same", "Class", "ambiguity_type")
)
nrow(df_merged)
## [1] 8855
# Number of distinct values in the subject column.
n_distinct(df_merged$subject)
## [1] 77
# Keep only rows whose ambiguity type is not "Unsure".
df_merged <- filter(df_merged, ambiguity_type != "Unsure")
nrow(df_merged)
## [1] 8624
Inter-annotator agreement
# Turn off dplyr's informational message emitted by summarise().
options(dplyr.summarise.inform = FALSE)
# Wide layout: one row per (word, same, ambiguity_type, version) and one
# column per subject, each cell holding that subject's relatedness rating.
df_by_ppt <- pivot_wider(
  select(df_merged, word, same, ambiguity_type, relatedness, subject, version),
  id_cols = c(word, same, ambiguity_type, version),
  names_from = subject,
  values_from = relatedness
)
## Compare to: df_norms_final
# Leave-one-out agreement: for each subject, correlate that subject's ratings
# with the mean rating given to the same items by all *other* subjects.
ppts <- unique(df_merged$subject)
# Collect one tidy test result per subject in a preallocated list and bind
# once at the end, rather than growing a data frame with rbind() inside the
# loop (which copies the accumulated result on every iteration).
results_by_ppt <- vector("list", length(ppts))
idx <- 0L
for (p1 in ppts) {
  idx <- idx + 1L
  # Recalculate the item norms with the current subject held out.
  df_norms_without_subject <- df_merged %>%
    filter(subject != p1) %>%
    group_by(word, same, ambiguity_type, version, Class) %>%
    summarise(mean_relatedness = mean(relatedness),
              median_relatedness = median(relatedness),
              diff = abs(mean_relatedness - median_relatedness),
              count = n(),
              sd_relatedness = sd(relatedness),
              distance_bert = mean(distance_bert),
              distance_elmo = mean(distance_elmo),
              se_relatedness = sd_relatedness / sqrt(n()),
              # Return an ungrouped table so no grouping leaks downstream.
              .groups = "drop")
  # Pair the held-out subject's own ratings with the norms from everyone else.
  df_joined <- df_merged %>%
    select(word, same, ambiguity_type, relatedness, subject, version) %>%
    filter(subject == p1) %>%
    left_join(df_norms_without_subject,
              by = c("word", "same", "ambiguity_type", "version"))
  # The ratings contain ties, so an exact Spearman p-value cannot be computed.
  # Requesting the asymptotic p-value explicitly yields the same estimate and
  # p-value as the automatic fallback, without one warning per subject.
  test <- cor.test(df_joined$relatedness,
                   df_joined$mean_relatedness,
                   method = "spearman",
                   exact = FALSE)
  df_test <- broom::tidy(test)
  df_test$p1 <- p1
  results_by_ppt[[idx]] <- df_test
}
# One row per subject: correlation estimate, statistic, p-value, and subject id.
df_r <- dplyr::bind_rows(results_by_ppt)
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
## Warning in cor.test.default(df_joined$relatedness, df_joined$mean_relatedness, :
## Cannot compute exact p-value with ties
# Histogram of the per-subject correlation estimates stored in df_r.
ggplot(df_r, aes(x = estimate)) +
  geom_histogram(alpha = 0.7) +
  # Dotted line: mean estimate across subjects (NAs ignored).
  geom_vline(xintercept = mean(df_r$estimate, na.rm = TRUE),
             linetype = "dotted") +
  # Dashed line: fixed reference value.
  # NOTE(review): the source of 0.58 is not shown here -- confirm and cite it.
  geom_vline(xintercept = 0.58,
             linetype = "dashed") +
  scale_x_continuous(limits = c(0, 1)) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
# Summary statistics of the per-subject correlation estimates. Every call
# drops NAs so that a single undefined correlation cannot turn one statistic
# into NA while the others silently ignore it.
mean(df_r$estimate, na.rm = TRUE)
## [1] 0.793326
median(df_r$estimate, na.rm = TRUE)
## [1] 0.8122813
sd(df_r$estimate, na.rm = TRUE)
## [1] 0.07448986
range(df_r$estimate, na.rm = TRUE)
## [1] 0.5453384 0.8821422
H3: Do ELMo/BERT explain independent variance?
# Reduced model without the BERT distance term. Fit with maximum likelihood
# (REML = FALSE) so it can be compared to the full model by likelihood ratio.
model_no_bert <- lmer(
  relatedness ~ same * ambiguity_type +
    distance_elmo +
    Class +
    (1 + same + ambiguity_type | subject) +
    (1 + same | word),
  data = df_merged,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)
# Does adding distance_bert improve fit over the ELMo-only model?
# NOTE(review): model_interaction is fit outside this section -- confirm it
# uses the same data and random-effects structure.
anova(model_interaction, model_no_bert)
## Data: df_merged
## Models:
## model_no_bert: relatedness ~ same * ambiguity_type + distance_elmo + Class +
## model_no_bert: (1 + same + ambiguity_type | subject) + (1 + same | word)
## model_interaction: relatedness ~ same * ambiguity_type + distance_bert + distance_elmo +
## model_interaction: Class + (1 + same + ambiguity_type | subject) + (1 + same |
## model_interaction: word)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_no_bert 16 24342 24455 -12155 24310
## model_interaction 17 24307 24428 -12137 24273 36.192 1 1.788e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Reduced model without the ELMo distance term. Fit with maximum likelihood
# (REML = FALSE) so it can be compared to the full model by likelihood ratio.
model_no_elmo <- lmer(
  relatedness ~ same * ambiguity_type +
    distance_bert +
    Class +
    (1 + same + ambiguity_type | subject) +
    (1 + same | word),
  data = df_merged,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)
# Does adding distance_elmo improve fit over the BERT-only model?
# NOTE(review): model_interaction is fit outside this section -- confirm it
# uses the same data and random-effects structure.
anova(model_interaction, model_no_elmo)
## Data: df_merged
## Models:
## model_no_elmo: relatedness ~ same * ambiguity_type + distance_bert + Class +
## model_no_elmo: (1 + same + ambiguity_type | subject) + (1 + same | word)
## model_interaction: relatedness ~ same * ambiguity_type + distance_bert + distance_elmo +
## model_interaction: Class + (1 + same + ambiguity_type | subject) + (1 + same |
## model_interaction: word)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_no_elmo 16 24322 24435 -12145 24290
## model_interaction 17 24307 24428 -12137 24273 16.924 1 3.89e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Discussion
It appears that Ambiguity Type explains variance in relatedness above and beyond that already explained by cosine distance and same sense. In particular, different-sense homonyms appear to be judged as less related, on average, than different-sense polysemes (which span a wider range).
# Raw-count histograms of relatedness ratings, one panel per combination of
# same-sense status and ambiguity type.
ggplot(df_merged, aes(x = relatedness)) +
  geom_histogram(bins = 5) +
  facet_wrap(~same + ambiguity_type) +
  theme_minimal()
# Same histograms on a density scale, so panels with different numbers of
# ratings are comparable; panels are laid out in two columns.
# NOTE(review): newer ggplot2 releases deprecate `..density..` in favour of
# `after_stat(density)` -- switch once the minimum ggplot2 version allows it.
ggplot(df_merged, aes(x = relatedness)) +
  geom_histogram(aes(y = ..density..),
                 bins = 5) +
  facet_wrap(~same + ambiguity_type,
             ncol = 2) +
  theme_minimal()
# Frequency polygons of relatedness ratings (raw counts), coloured by
# same-sense status, with one stacked panel per ambiguity type.
ggplot(df_merged, aes(x = relatedness, color = same)) +
  geom_freqpoly(bins = 5) +
  facet_wrap(~ambiguity_type, ncol = 1) +
  theme_minimal()
# Frequency polygons on a density scale, coloured by same-sense status, with
# one stacked panel per ambiguity type.
# NOTE(review): newer ggplot2 releases deprecate `..density..` in favour of
# `after_stat(density)` -- switch once the minimum ggplot2 version allows it.
ggplot(df_merged, aes(x = relatedness, color = same)) +
  geom_freqpoly(aes(y = ..density..),
                bins = 5) +
  facet_wrap(~ambiguity_type, ncol = 1) +
  theme_minimal()